{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multi-Perspective Sentence Similarity Modeling with Convolutional Neural Networks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "DATA_PATH = \"./data/\"\n",
    "TRAIN_PATH = DATA_PATH + \"train.csv\"\n",
    "TEST_PATH = DATA_PATH + \"test.csv\"\n",
    "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n",
    "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n",
    "QUEST_PATH = DATA_PATH + \"question.csv\"\n",
    "\n",
    "train_data = pd.read_csv(TRAIN_PATH)\n",
    "test_data = pd.read_csv(TEST_PATH)\n",
    "question_data = pd.read_csv(QUEST_PATH)\n",
    "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n",
    "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n",
    "\n",
    "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n",
    "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n",
    "\n",
    "label = train_data[\"label\"].values\n",
    "\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "\n",
    "MAX_COUNT = 10000\n",
    "\n",
    "word_tokenizer = Tokenizer(MAX_COUNT)\n",
    "word_tokenizer.fit_on_texts(question_data[\"words\"])\n",
    "\n",
    "word_embedding_data = np.concatenate(\n",
    "    (\n",
    "        np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n",
    "        word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n",
    "    ),\n",
    "    axis=0\n",
    ")\n",
    "\n",
    "char_tokenizer = Tokenizer(MAX_COUNT)\n",
    "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n",
    "\n",
    "char_embedding_data = np.concatenate(\n",
    "    (\n",
    "        np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n",
    "        char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n",
    "    ),\n",
    "    axis=0\n",
    ")\n",
    "\n",
    "word_embedding_data.shape, char_embedding_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "SEQ_LEN = 30\n",
    "\n",
    "def gen_word_data(data):\n",
    "    seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n",
    "    seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n",
    "    return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n",
    "        pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n",
    "    \n",
    "def gen_char_data(data):\n",
    "    seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n",
    "    seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n",
    "    return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n",
    "        pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n",
    "\n",
    "word1, word2 = gen_word_data(train_data)\n",
    "char1, char2 = gen_char_data(train_data)\n",
    "test_word1, test_word2 = gen_word_data(test_data)\n",
    "test_char1, test_char2 = gen_char_data(test_data)\n",
    "\n",
    "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.models import Model\n",
    "from keras.layers.merge import concatenate\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "from keras.layers import LSTM, Bidirectional, TimeDistributed\n",
    "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D\n",
    "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activationation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(\n",
    "    word1, word2, train_data[\"label\"].values,\n",
    "    test_size=0.2\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "embedding_layer = Embedding(\n",
    "        input_dim=word_embedding_data.shape[0],  # word/char switch\n",
    "        output_dim=word_embedding_data.shape[1],  # word/char switch\n",
    "        weights=[word_embedding_data],  # word/char switch\n",
    "        input_length=SEQ_LEN,\n",
    "        trainable=False\n",
    "    )\n",
    "\n",
    "vector1 = embedding_layer(input1)\n",
    "vector2 = embedding_layer(input2)\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
